import pandas as pd

# State lookup table, read once when this module is imported.
# cbgCodeTPDataset uses its `postal_code` and `fips` columns to map each
# row's `state` value to a state FIPS code.
states = pd.read_csv('/REDACTED/horizontal/data/raw/state_fips.csv')


# use ty2014 file to generate crosswalk between CBG and taxpayer_id

def _isMissingCode(codes):
    """Return a boolean mask of entries that are NaN/None or the '.' placeholder."""
    return codes.isna() | (codes.astype(str) == '.')


def _zeroPadCode(codes, width):
    """Render codes as strings left-padded with zeros to `width` characters."""
    if pd.api.types.is_float_dtype(codes):
        # A numeric column that contained NaN is stored as float; cast back to
        # int so 1234.0 is rendered as '1234' rather than '1234.0'.
        codes = codes.astype('int64')
    return codes.astype(str).str.zfill(width)


def cbgCodeTPDataset(tp, dropOriginalColumns=True, dropMissing=True, stateFips=None):
    """Add a `cbg` column built from state, county, tract and block codes.

    The `cbg` value is the concatenation of the 2-character state FIPS code,
    the 3-character `cfips`, the 6-character `tract` and the first character
    of the 4-character `block`, each left-padded with zeros.

    Parameters
    ----------
    tp : pandas.DataFrame
        Must contain the columns `cfips`, `tract`, `block` and `state`.
        The input frame is not modified; a copy is processed.
    dropOriginalColumns : bool, default True
        If True, remove `state`, `tract`, `block`, `cfips` and the
        intermediate `sfips` / `block1` columns from the result.
    dropMissing : bool, default True
        If True, rows whose `tract`, `block` or `cfips` is missing (NaN/None
        or the string '.') are removed. If False, those rows are appended
        after the valid rows with `cbg` set to '.'.
    stateFips : pandas.DataFrame, optional
        Lookup table with `postal_code` and `fips` columns. Defaults to the
        module-level `states` table. It is never modified.

    Returns
    -------
    pandas.DataFrame
        The processed copy of `tp` with the new `cbg` column.

    Raises
    ------
    KeyError
        If any of the required columns is absent from `tp`.

    Notes
    -----
    NOTE(review): rows whose `state` has no match in the lookup table are
    kept, and their `cbg` starts with the string form of a missing value
    ('nan'), as before. Confirm whether they should be treated as missing.
    """
    required = ['cfips', 'tract', 'block', 'state']
    absent = [col for col in required if col not in tp.columns]
    if absent:
        # Fail immediately with a clear message instead of printing and then
        # crashing further down on an unrelated attribute/key lookup.
        raise KeyError('Incorrect data columns: missing ' + ', '.join(absent))

    if stateFips is None:
        stateFips = states

    tp = tp.copy()
    pre_drop = tp.shape[0]

    if pd.api.types.is_numeric_dtype(tp['cfips']):
        print('assuming ints for important vars')
    else:
        print('assuming strings for important vars')

    ## drop if tract, block, or cfips is missing
    # Both NaN and '.' are checked for every column regardless of dtype: a
    # numeric column holding NaN is float64, so a dtype test on `cfips` alone
    # cannot tell which kind of missing marker is in use.
    drop_mask = (
        _isMissingCode(tp['tract'])
        | _isMissingCode(tp['block'])
        | _isMissingCode(tp['cfips'])
    )
    # Explicit copies so the column assignments below never write to a view.
    dropped = tp.loc[drop_mask].copy()
    tp = tp.loc[~drop_mask].copy()
    post_drop = tp.shape[0]

    print(str(pre_drop - post_drop) + ' rows with missing data dropped.')

    ## make sure state, county, tract, and block-level codes have the correct number of characters
    tp['cfips'] = _zeroPadCode(tp['cfips'], 3)
    tp['tract'] = _zeroPadCode(tp['tract'], 6)
    tp['block'] = _zeroPadCode(tp['block'], 4)

    ## generate dictionary of state fips, add state fips to tp
    # Pad on a derived Series so the shared lookup table is left untouched.
    fips_dict = dict(zip(stateFips['postal_code'], _zeroPadCode(stateFips['fips'], 2)))
    tp['sfips'] = tp['state'].map(fips_dict).astype(str)
    dropped['sfips'] = None

    ## generate block group var (first character of the block code)
    tp['block1'] = tp['block'].str[:1]

    ## generate cbg
    tp['cbg'] = tp['sfips'] + tp['cfips'] + tp['tract'] + tp['block1']
    dropped['cbg'] = '.'

    ## drop columns we don't need anymore
    if dropOriginalColumns:
        tp = tp.drop(columns=['state', 'tract', 'block', 'cfips', 'sfips', 'block1'])
        dropped = dropped.drop(columns=['state', 'tract', 'block', 'cfips', 'sfips'])

    ## bring back missing rows if desired
    if not dropMissing:
        tp = pd.concat([tp, dropped], axis=0)
    return tp

 
    



